#PACKAGES
packages <- c("sp.scRNAseq", "sp.scRNAseqData", "ggthemes", "tidyverse", "seqTools")
purrr::walk(packages, library, character.only = TRUE)
rm(packages)
Identify DE genes between colon and SI stem cells via single cell RNA-seq data from mouse. This was perfored using 2 different strategies: a) pairwise comparison of all cell types as defined by unsupervised classification and b) comparison of the top 50 Lgr5+ colon vs small intestine cells from the stem cell population (as defined above).
First we run unsupervised classificaiton using t-SNE and the Mclust software.
#setup spCounts
s <- str_detect(colnames(countsMgfp), "^s")
countsMgfpMeta <- countsMgfpMeta %>%
mutate(mouseName = if_else(
str_detect(plate, "NJA004"),
"C57black/6J", "C57black/6J/Lgr5gfp"
))
m <- colnames(countsMgfp) %in% filter(countsMgfpMeta, mouseName == "C57black/6J")$sample
cObjSng <- spCounts(countsMgfp[, s & m], cbind(countsMgfpERCC[, s & m]))
cObjMul <- spCounts(countsMgfp[, !s & m], cbind(countsMgfpERCC[, !s & m]))
#spUnsupervised
load("~/Github/sp.scRNAseqTesting/inst/differentialExpLgr5ColonSI/uObj.rda")
Plot unsupervised classification results.
plotUnsupervisedClass(uObj, cObjSng)
Show marker expression specific for the cell types in the tissue.
plotUnsupervisedMarkers(
uObj, cObjSng,
c("Lgr5", "Muc2", "Ptprc", "Chga", "Alpi", "Lyz1", "Dclk1", "Slc40a1"),
pal = RColorBrewer::brewer.pal(8, "Set1")
)
Show the tissues.
plotUnsupervisedClass(uObj, cObjSng) %>%
plotData() %>%
inner_join(countsMgfpMeta, by = c("Sample" = "sample")) %>%
ggplot() +
geom_point(aes(`t-SNE dim 1`, `t-SNE dim 2`, colour = tissue)) +
theme_few() +
theme(legend.position = "top") +
guides(colour = guide_legend(title = "Tissue"))
Rename the classes
classifications <- tibble(
sample = rownames(getData(uObj, "tsne")),
oldClass = getData(uObj, "classification")
) %>%
mutate(newClass = case_when(
oldClass == "C1" ~ "C.Stem",
oldClass == "F1" ~ "C.Colonocyte",
oldClass == "B1" ~ "C.Goblet",
oldClass == "G1" ~ "Blood.Tufft.Endocrine",
oldClass == "D1" ~ "SI.Stem",
oldClass == "A1" ~ "SI.Enterocyte",
oldClass == "E1" ~ "SI.Goblet",
TRUE ~ "error"
))
classification(uObj) <- classifications$newClass[match(rownames(getData(uObj, "tsne")), classifications$sample)]
groupMeans(uObj) <- averageGroupExpression(cObjSng, getData(uObj, "classification"), FALSE)
tsneMeans(uObj) <- tsneGroupMeans(getData(uObj, "tsne"), getData(uObj, "classification"))
plotUnsupervisedClass(uObj, cObjSng)
Show Lgr5 expression.
plotUnsupervisedMarkers(uObj, cObjSng, "Lgr5")
Expression is log2(cpm) and then scaled to the interval [0, 1] with: (exp - min(exp)) / (max(exp) - min(exp)).
Show Lgr5 expression per class and tissue.
plotUnsupervisedMarkers(uObj, cObjSng, "Lgr5") %>%
plotData() %>%
inner_join(countsMgfpMeta, by = c("Sample" = "sample")) %>%
ggplot() +
geom_histogram(
aes(Lgr5, fill = tissue, y = ..count.., group = tissue), binwidth = 0.05, position = position_dodge(),
alpha = 0.7
) +
facet_wrap(~Classification, scales = "free_y") +
theme_bw() +
theme(legend.position = "top", plot.caption = element_text(hjust = 0)) +
guides(fill = guide_legend(title = "Tissue")) +
labs(
x = "Lgr5 expression: log2(cpm) scaled to the interval [0, 1] with: (exp - min(exp)) / (max(exp) - min(exp))."
)
Pairwise KStest using all classes.
ks.res <- KStest(getData(cObjSng, "counts.cpm"), getData(uObj, "classification"), cores = 8)
p.ks.res <- processKStest(ks.res, getData(uObj, "classification"), 0.05) %>%
filter(sigBool) %>%
group_by(gene) %>%
filter(n() == 1) %>%
ungroup() %>%
mutate(p.prod = map_dbl(p.values, prod)) %>%
mutate(meanCpm = map2_dbl(id, gene, function(i, g) {
mean(getData(cObjSng, "counts.cpm")[g, getData(uObj, "classification") == i])
})) %>%
mutate(log2.fc = map2_dbl(id, gene, function(i, g) {
log2(
mean(getData(cObjSng, "counts.cpm")[g, getData(uObj, "classification") == i]) /
mean(getData(cObjSng, "counts.cpm")[g, getData(uObj, "classification") != i])
)
})) %>%
arrange(desc(log2.fc), p.prod, desc(statSum), desc(meanCpm))
save(p.ks.res, file = './differentialExpressionStrategy1.rda', compress = "bzip2")
openxlsx::write.xlsx(p.ks.res, file = "./differentialExpressionStrategy1.xlsx")
Number of significant genes detected at alpha 0.05?
p.ks.res %>% count(id)
## # A tibble: 7 x 2
## id n
## <chr> <int>
## 1 Blood.Tufft.Endocrine 374
## 2 C.Colonocyte 133
## 3 C.Goblet 90
## 4 C.Stem 38
## 5 SI.Enterocyte 213
## 6 SI.Goblet 70
## 7 SI.Stem 158
Number of significant genes detected at alpha 0.01?
processKStest(ks.res, getData(uObj, "classification"), 0.01) %>%
filter(sigBool) %>%
group_by(gene) %>%
filter(n() == 1) %>%
ungroup() %>%
mutate(p.prod = map_dbl(p.values, prod)) %>%
count(id)
## # A tibble: 7 x 2
## id n
## <chr> <int>
## 1 Blood.Tufft.Endocrine 240
## 2 C.Colonocyte 95
## 3 C.Goblet 56
## 4 C.Stem 25
## 5 SI.Enterocyte 126
## 6 SI.Goblet 39
## 7 SI.Stem 92
Number of significant genes detected at alpha 0.001?
processKStest(ks.res, getData(uObj, "classification"), 0.001) %>%
filter(sigBool) %>%
group_by(gene) %>%
filter(n() == 1) %>%
ungroup() %>%
mutate(p.prod = map_dbl(p.values, prod)) %>%
count(id)
## # A tibble: 7 x 2
## id n
## <chr> <int>
## 1 Blood.Tufft.Endocrine 149
## 2 C.Colonocyte 59
## 3 C.Goblet 31
## 4 C.Stem 22
## 5 SI.Enterocyte 74
## 6 SI.Goblet 21
## 7 SI.Stem 67
Number of significant genes detected at alpha 0.05 and fold change > 2?
p.ks.res %>%
filter(log2.fc > 1 | log2.fc < -1) %>%
count(id)
## # A tibble: 7 x 2
## id n
## <chr> <int>
## 1 Blood.Tufft.Endocrine 347
## 2 C.Colonocyte 79
## 3 C.Goblet 77
## 4 C.Stem 17
## 5 SI.Enterocyte 126
## 6 SI.Goblet 60
## 7 SI.Stem 66
Plot some candidates in the context of all cells.
#dotplot function
dotplot <- function(data) {
data %>%
rename(Gene = gene) %>%
ggplot() +
geom_dotplot(
data = . %>% filter(expression != 0),
aes(Classification, expression, colour = Gene, fill = Gene, group = interaction(Classification, Gene)),
binaxis="y", stackdir="center", binwidth = 0.01, position = position_dodge()
) +
geom_text(
data = . %>%
group_by(Classification, Gene) %>%
summarize(do = paste0(round(length(which(expression == 0)) / n(), digits = 2) * 100, "%")),
aes(Classification, 0.05, label = do), size = 2
) +
geom_point(
data = . %>%
group_by(Gene, Classification) %>%
summarize(median = median(expression)) %>%
filter(median != 0),
aes(Classification, median), shape = 95, size = 3
) +
ylim(0, 1) +
theme_few() +
theme(axis.text.x = element_text(angle = 90), legend.position = "top") +
facet_wrap(~Gene) +
labs(
x = "Tissue",
y = "log2(expression)[0, 1]"
) +
guides(fill = FALSE, colour = FALSE)
}
SI stem genes
#plot SI candidates
p.ks.res %>%
filter(id == "SI.Stem") %>%
arrange(desc(log2.fc), p.prod, desc(statSum), desc(meanCpm)) %>%
slice(1:30) %>%
pull(gene) %>%
plotUnsupervisedMarkers(uObj, cObjSng, .) %>%
plotData() %>%
gather(gene, expression, -(Sample:Colour)) %>%
dotplot()
Colon stem genes
p.ks.res %>%
filter(id == "C.Stem") %>%
arrange(desc(log2.fc), p.prod, desc(statSum), desc(meanCpm)) %>%
slice(1:30) %>%
pull(gene) %>%
plotUnsupervisedMarkers(uObj, cObjSng, .) %>%
plotData() %>%
gather(gene, expression, -(Sample:Colour)) %>%
dotplot()
Results were saved to the file “differentialExpressionStrategy1” with the following columns:
Id: the cell type that the corresponding gene has been found to be differentially expressed in. Gene: the name of the gene sigBool: a logical indicating if all pairwise tests were significant (alpha = 0.001) statSum: the sum of test statistics from the KS tests p.values: the individual p values from each pairwise test that was performed for the corresponding gene Statistics: the individual test statistics from each pairwise test that was performed for the corresponding gene p.prod: the product of the p.values from all pairwise tests meanCpm: the mean counts per million expression values for the gene in the cell type listed in the “id” column Log2.fc: the log 2 fold change comparing the cell type listed in the “id” column vs all other cell types as a group.
KStest using only the top 50 cells in each of the C.Stem and S.Stem classes with highest Lgr5 and comparing only these classes.
idx <- which(getData(uObj, "classification") %in% c("C.Stem", "SI.Stem"))
counts <- getData(cObjSng, "counts.cpm")[, idx]
classes <- getData(uObj, "classification")[idx]
#find top 50 cells in each class with highest Lgr5
top50 <- tibble(Lgr5 = counts["Lgr5", ], class = classes, sample = colnames(counts)) %>%
group_by(class) %>%
top_n(50, Lgr5) %>%
ungroup()
ks.res2 <- KStest(counts[, match(top50$sample, colnames(counts))], top50$class, cores = 8)
p.ks.res2 <- ks.res2 %>%
filter(p.value < 0.001) %>%
separate(combination, c("Celltype1", "Celltype2"), sep = "-") %>%
mutate(
meanCpmCelltype1 = map2_dbl(Celltype1, gene, function(i, g) {
mean(getData(cObjSng, "counts.cpm")[g, getData(uObj, "classification") == i])
}),
meanCpmCelltype2 = map2_dbl(Celltype2, gene, function(i, g) {
mean(getData(cObjSng, "counts.cpm")[g, getData(uObj, "classification") == i])
})
) %>%
mutate(log2.fc = log2(meanCpmCelltype1 / meanCpmCelltype2)) %>%
arrange(desc(log2.fc), p.value)
save(p.ks.res2, file = '~/Github/sp.scRNAseq/inst/differentialExpLgr5ColonSI/differentialExpressionStrategy2.rda', compress = "bzip2")
openxlsx::write.xlsx(p.ks.res2, file = "~/Github/sp.scRNAseq/inst/differentialExpLgr5ColonSI/differentialExpressionStrategy2.xlsx")
Number of significant genes detected at alpha 0.05?
ks.res2 %>%
filter(p.value < 0.05) %>%
count(combination)
## # A tibble: 1 x 2
## combination n
## <chr> <int>
## 1 C.Stem-SI.Stem 1160
Number of significant genes detected at alpha 0.01?
ks.res2 %>%
filter(p.value < 0.01) %>%
count(combination)
## # A tibble: 1 x 2
## combination n
## <chr> <int>
## 1 C.Stem-SI.Stem 536
Number of significant genes detected at alpha 0.001?
ks.res2 %>%
filter(p.value < 0.001) %>%
count(combination)
## # A tibble: 1 x 2
## combination n
## <chr> <int>
## 1 C.Stem-SI.Stem 274
Number of significant genes detected at alpha 0.05 and fold change > 2?
p.ks.res2 %>%
filter(log2.fc > 1 | log2.fc < -1) %>%
unite(combination, Celltype1, Celltype2) %>%
count(combination)
## # A tibble: 1 x 2
## combination n
## <chr> <int>
## 1 C.Stem_SI.Stem 195
Plot some candidates in the context of all cells.
SI stem genes
p.ks.res2 %>%
filter(log2.fc < 1) %>%
arrange(log2.fc, p.value) %>%
slice(1:30) %>%
pull(gene) %>%
plotUnsupervisedMarkers(uObj, cObjSng, .) %>%
plotData() %>%
gather(gene, expression, -(Sample:Colour)) %>%
dotplot()
Colon stem genes
p.ks.res2 %>%
filter(log2.fc > 1) %>%
arrange(desc(log2.fc), p.value) %>%
slice(1:30) %>%
pull(gene) %>%
plotUnsupervisedMarkers(uObj, cObjSng, .) %>%
plotData() %>%
gather(gene, expression, -(Sample:Colour)) %>%
dotplot()
Results were saved to the file “differentialExpressionStrategy2” with the following columns:
Cell type 1: 1st cell type in comparison Cell type 2: 2nd cell type in comparison Gene: the name of the gene Stat: the test statistic from the KS test that was performed for the corresponding gene p.value: the p values from the KS test that was performed for the corresponding gene meanCpmCellType1: mean counts per million for cell type 1 meanCpmCellType2: mean counts per million for cell type 2 Log2.fc: the log 2 fold change comparing the two cell types.
sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 seqTools_0.0.0.9000
## [3] forcats_0.3.0 stringr_1.3.1
## [5] dplyr_0.7.5 purrr_0.2.5
## [7] readr_1.1.1 tidyr_0.8.1
## [9] tibble_1.4.2 ggplot2_2.2.1
## [11] tidyverse_1.2.1 ggthemes_3.5.0
## [13] sp.scRNAseqData_0.0.1.0 sp.scRNAseq_0.0.2.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-137 matrixStats_0.53.1 lubridate_1.7.4
## [4] RColorBrewer_1.1-2 httr_1.3.1 rprojroot_1.3-2
## [7] tools_3.5.0 backports_1.1.2 utf8_1.1.4
## [10] R6_2.2.2 lazyeval_0.2.1 BiocGenerics_0.26.0
## [13] colorspace_1.3-2 tidyselect_0.2.4 gridExtra_2.3
## [16] mnormt_1.5-5 compiler_3.5.0 cli_1.0.0
## [19] rvest_0.3.2 xml2_1.2.0 labeling_0.3
## [22] scales_0.5.0 psych_1.8.4 digest_0.6.15
## [25] foreign_0.8-70 rmarkdown_1.9 pkgconfig_2.0.1
## [28] htmltools_0.3.6 rlang_0.2.1 readxl_1.1.0
## [31] rstudioapi_0.7 bindr_0.1.1 jsonlite_1.5
## [34] mclust_5.4 zip_1.0.0 magrittr_1.5
## [37] Rcpp_0.12.17 munsell_0.4.3 S4Vectors_0.18.2
## [40] viridis_0.5.1 stringi_1.2.2 yaml_2.1.19
## [43] ggraph_1.0.1 MASS_7.3-50 Rtsne_0.13
## [46] plyr_1.8.4 grid_3.5.0 parallel_3.5.0
## [49] ggrepel_0.8.0 crayon_1.3.4 udunits2_0.13
## [52] lattice_0.20-35 haven_1.1.1 hms_0.4.2
## [55] knitr_1.20 pillar_1.2.3 igraph_1.2.1
## [58] pso_1.0.3 reshape2_1.4.3 stats4_3.5.0
## [61] glue_1.2.0 evaluate_0.10.1 modelr_0.1.2
## [64] tweenr_0.1.5 cellranger_1.1.0 gtable_0.2.0
## [67] assertthat_0.2.0 ggforce_0.1.2 openxlsx_4.1.0
## [70] broom_0.4.4 tidygraph_1.1.0 e1071_1.6-8
## [73] class_7.3-14 viridisLite_0.3.0 units_0.5-1